#!/usr/local/bin/python
#-*- encoding:utf-8 -*- 
 
from whoosh.index import create_in  
from whoosh.fields import *  
from chinesetokenizer import ChineseAnalyzer
import utils

# Build a Whoosh index from the tab-separated click corpus.
#
# Both the corpus path ('clickCorpus') and the index directory
# ('clickTitleIndex') are read from the 'bm25' section of the project config.
# Each corpus line is expected to have exactly four tab-separated fields; the
# first is indexed as `appid` and the fourth as `content`. Lines with any other
# number of fields are skipped.
config = utils.get_config()
analyzer = ChineseAnalyzer()
# Format used for the term vector of the `content` field.
term_vector_format = formats.Frequency()
schema = Schema(
    appid=ID(stored=True),
    content=TEXT(stored=True, analyzer=analyzer, vector=term_vector_format),
)

ix = create_in(config.get('bm25', 'clickTitleIndex'), schema)

EXPECTED_FIELD_COUNT = 4

# Open the corpus before acquiring the writer so that a bad corpus path does
# not leave a dangling index writer. Both context managers guarantee cleanup:
# the file is always closed, and the writer commits on success or cancels
# (releasing its lock) if an exception escapes the loop.
with open(config.get('bm25', 'clickCorpus')) as corpus:
    with ix.writer() as writer:
        for line in corpus:
            # Drop the line terminator so it is not stored as part of the
            # last field's content.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) != EXPECTED_FIELD_COUNT:
                continue
            writer.add_document(
                appid=fields[0].decode('utf-8'),
                content=fields[3].decode('utf-8'),
            )

